{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "import pyprind\n",
    "import pandas as pd\n",
    "import os\n",
    "from nltk.corpus import stopwords\n",
    "import re\n",
    "import numpy as np\n",
    "\n",
    "stop = stopwords.words('english')\n",
    "\n",
    "def tokenizer(text):\n",
    "    text = re.sub('<[^>]*>', '', text)\n",
    "    emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n",
    "    text = re.sub('[\\W]+', ' ', text.lower()) +\\\n",
    "        ' '.join(emoticons).replace('-', '')\n",
    "    tokenized = [w for w in text.split() if w not in stop]\n",
    "    tokenized = text.split()\n",
    "    return tokenized\n",
    "\n",
    "train_data_path = 'data/labeledTrainData.tsv'\n",
    "df = pd.read_csv(train_data_path, sep=\"\\t\")\n",
    "df[\"review\"] = df[\"review\"].apply(lambda x : tokenizer(x))\n",
    "df.to_csv('data/movie_data.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 生成词向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "0% [#######                       ] 100% | ETA: 00:00:14"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocabulary building finished, start training...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "0% [###############               ] 100% | ETA: 00:00:12"
     ]
    }
   ],
   "source": [
    "import pyprind\n",
    "import gensim.models\n",
    "import re\n",
    "\n",
    "inpath = 'data/movie_data.csv'\n",
    "outpath = 'results/wordVectTrainResult'\n",
    "pbar = pyprind.ProgBar(100000)\n",
    "class csvStream(object):\n",
    "    def __init__(self,path):\n",
    "        self.path=path\n",
    "    def __iter__(self):\n",
    "        with open(self.path, 'r',) as csv:\n",
    "            next(csv)  # skip header\n",
    "            for line in csv:\n",
    "                text = line[4:-3]\n",
    "                text = re.sub('[\\'\\\"\\[\\]\\d\\b]','',text)   \n",
    "                while (text[0] == ',') or (text[0] == ' '):\n",
    "                    text = text[1:]\n",
    "                pbar.update()\n",
    "                yield text.split(', ')\n",
    "\n",
    "\n",
    "lineIterator = csvStream(inpath)\n",
    "model = gensim.models.Word2Vec()\n",
    "model.build_vocab(lineIterator)\n",
    "print('vocabulary building finished, start training...')\n",
    "model.train(lineIterator,total_examples=model.corpus_count,epochs=1)\n",
    "model.save(outpath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 词向量测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 100 100\n",
      "0.9167262516432011\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/hz/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:23: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "/Users/hz/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:24: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "/Users/hz/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:25: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"
     ]
    }
   ],
   "source": [
    "import gensim.models\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def cos_sim(vector_a, vector_b):\n",
    "    \"\"\"\n",
    "    计算两个向量之间的余弦相似度\n",
    "    :param vector_a: 向量 a \n",
    "    :param vector_b: 向量 b\n",
    "    :return: sim\n",
    "    \"\"\"\n",
    "    vector_a = np.mat(vector_a)\n",
    "    vector_b = np.mat(vector_b)\n",
    "    num = float(vector_a * vector_b.T)\n",
    "    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)\n",
    "    cos = num / denom\n",
    "    sim = 0.5 + 0.5 * cos\n",
    "    return sim\n",
    "\n",
    "\n",
    "inpath = 'results/wordVectTrainResult'\n",
    "model = gensim.models.Word2Vec.load(inpath)\n",
    "test1 = model[\"good\"]\n",
    "test2 = model[\"nice\"]\n",
    "test3 = model[\"go\"]\n",
    "\n",
    "\n",
    "print(len(test1), len(test2),len(test3))\n",
    "print(cos_sim(test1,test2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 分类测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/hz/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:15: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n",
      "  from ipykernel import kernelapp as app\n",
      "/Users/hz/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:17: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         id  sentiment                                             review  len\n",
      "0  12311_10          0  [-0.19330275617890003, 0.10081300452282421, 0....  100\n",
      "1    8348_2          0  [-0.07429290727502805, 0.12601268771738255, 0....  100\n",
      "2    5828_4          0  [0.11162658395491294, 0.00733450739030409, 0.3...  100\n",
      "3    7186_2          0  [-0.0369541522263067, 0.16518554098105856, 0.3...  100\n",
      "4   12128_7          0  [-0.09854314756706696, 0.14072058940038198, 0....  100\n",
      "5    2913_8          0  [0.034660419103091776, 0.05697532973075008, 0....  100\n",
      "6    4396_1          0  [0.17724977271983355, 0.12278373242634547, 0.4...  100\n",
      "7     395_2          0  [0.1945963272358248, 0.16014676212107132, 0.41...  100\n",
      "8   10616_1          0  [0.11036306295208022, -0.05364430498820348, 0....  100\n",
      "9    9074_9          0  [-0.0893094160754835, -0.01371210671004927, 0....  100\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import linear_model\n",
    "\n",
    "df1 = pd.read_csv(\"data/sampleSubmission.csv\", sep=\",\")\n",
    "df2 = pd.read_csv(\"data/testData.tsv\", sep=\"\\t\")\n",
    "df = pd.merge(df1, df2, on=\"id\", how=\"inner\")\n",
    "\n",
    "inpath = 'results/wordVectTrainResult'\n",
    "model = gensim.models.Word2Vec.load(inpath)\n",
    "\n",
    "def get_text_vector(text):\n",
    "    res = np.zeros([100])\n",
    "    for item in text:\n",
    "        if item not in model:\n",
    "            continue\n",
    "        res += model[item]\n",
    "    return res / len(text)\n",
    "\n",
    "df[\"review\"] = df[\"review\"].apply(lambda x : tokenizer(x))\n",
    "df[\"review\"] = df[\"review\"].apply(lambda x : get_text_vector(x))\n",
    "df[\"len\"] = df[\"review\"].apply(lambda x : len(x))\n",
    "\n",
    "print(df[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.neural_network import MLPClassifier\n",
    "\n",
    "X = []\n",
    "for item in df[[\"review\"]].values.ravel():\n",
    "    X.append(item)\n",
    "\n",
    "y = df[\"sentiment\"].values.ravel()\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)\n",
    "\n",
    "clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,), random_state=1)\n",
    "clf.fit(X_train, y_train)\n",
    "\n",
    "clf.predict_proba(X_test[:1])\n",
    "\n",
    "clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000,)"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a=df[[\"review\"]].values.ravel()\n",
    "a.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "cannot reshape array of size 25000 into shape (25000,100)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-119-c9944cedd54d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m25000\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m: cannot reshape array of size 25000 into shape (25000,100)"
     ]
    }
   ],
   "source": [
    "a.reshape(25000,100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
